library(muhaz) kphaz.plot <- function(fit, minx=0, miny=0, maxx=0, maxy=0, step=FALSE, smooth=FALSE, points=FALSE, ...) { ######################################################################## # "kphaz.plot" is an S function plots the K-M-type hazard # function estimate generated by kphaz.fit. # Required Arguments: # fit: results of a call to kphaz.fit. # ...: additional arguments for the plot function ######################################################################## # # Make sure time and haz exist and are the same length # if(any(names(fit) == "time") & (any(names(fit) == "haz"))) { time <- fit$time haz <- fit$haz } else stop("Arguement \"fit\" must be the result of a call to \"kphaz.fit\"" ) # if(length(time) != length(haz)) stop( "Arguement \"fit\" must be the result of a call to \"kphaz.fit\"" ) # # Check to see if there are any strata # qstrata <- any(names(fit) == "strata") if(qstrata) strata <- fit$strata else strata <- rep(1, length(time)) if(length(strata) != length(haz)) stop( "Arguement \"fit\" must be the result of a call to \"kphaz.fit\"" ) # # Define, ustrata, the number of unique strata # ustrata <- unique(strata) good <- 1:length(ustrata) for(i in 1:length(ustrata)) { cur.strata <- ustrata[i] ind <- strata == ustrata[i] if(all(is.na(haz[ind] | is.nan(haz[ind])))) good <- good[good != i] } ustrata <- ustrata[good] # # Check to see if there are any plots to be made # if(length(ustrata) < 1) stop("No plots") # # Make the first plot # # Gary Feng: add maxx and maxy ind <- (strata == ustrata[1]) & (!is.nan(haz)) # xmax <- max(time) # ymax <- max(haz[((!is.nan(haz)) & (!is.na(haz)))]) if(missing(maxx) | maxx==0){ xmax <- max(time) } else { xmax <-maxx } if(missing(maxy) | maxy==0) { ymax <- max(haz[((!is.nan(haz)) & (!is.na(haz)))]) } else { ymax <- maxy } ## End Gary Feng x <- time[ind] y <- haz[ind] if(min(x) > 0) { x <- c(0, x) y <- c(0, y) } # Gary Feng: empty plot for now # plot(x, y, xlim = c(0, xmax), ylim = c(0, ymax), xlab = "Time", # ylab = "Hazard", type = "s", ...) # plot(x, y, xlim = c(minx, xmax), ylim = c(miny, ymax), xlab = "Time", ylab = "Hazard", type = "n", ...) # # Make any subsequent plots # # if(length(ustrata) > 1) { # for(i in 2:length(ustrata)) { if(length(ustrata) >= 1) { for(i in 1:length(ustrata)) { # ind <- strata == ustrata[i] ind <- (strata == ustrata[i]) & (!is.nan(haz)) & (!is.na(haz)) x <- time[ind] # Gary Feng: the function gives error message: # Error in stepfun(x, y) : 'y' must be one longer than 'x' # adding a zero at the beginning if (step==TRUE) { y <- c(0, haz[ind]) } else { y <- haz[ind] } if(min(x) > 0) { x <- c(0, x) y <- c(0, y) } # Gary Feng: use lines, not steps if (step==TRUE) { lines(stepfun(x, y), lty = i, col=i) } else { if (points) points(x,y,col=i) # Gary Feng: Smooth data if needed if (smooth==TRUE) { y<-smooth(y) } lines(x,y, col=i, lty = i) # lines(x,y, col="light gray", lty = 1) } # end Gary Feng } } # # Return # invisible() } ############################################### # kphaz.plot(out, maxy=0.02, smooth=TRUE) # segmented library(segmented) #################################### #dundee<-read.table("dundee.dat", header=TRUE) #dundee$logfreq<-log1p(dundee$TXFR) load("dundee.rda"); #################################### # reg vs refix #################################### # bin width binwidth=5 dundee$fixdur<-round(dundee$FDUR/binwidth)*binwidth status<-rep(1,length(dundee$fixdur)) # regression vs refixations par(mfrow=c(1,1)) ind<-(dundee$isFFD==0 & dundee$FDUR>60) out1<-kphaz.fit(dundee$fixdur[ind],status[ind],dundee$isReg[ind]) kphaz.plot(out1, maxx=600, maxy=0.03, smooth=TRUE) us=unique(out1$strata) initpara<-vector("list", length(us)); for (j in 1:length(us)) { initpara[[j]]<-c(100,180,250); } #initpara[[1]]<-c(120, 200, 250) #initpara[[2]]<-c(130, 180, 250) par(mfrow=c(1, 1)) maxtime=500 plot(out1$time, out1$haz, , xlim=c(0,maxtime), ylim=c(0,0.02),xlab="Fixation Duration", ylab="Hazard rate",type="n") for (i in 1:length(us)) { ind<-(out1$strata==us[i] & out1$time<=maxtime & out1$time>=60 & out1$haz<0.017) # tt<-smooth(out1$haz[ind]) d<-data.frame(y=out1$haz[ind], x=out1$time[ind]) h.lm<-lm(y~x,data=d) summary(h.lm) h.seg1<-segmented(h.lm,seg.Z=~x,psi=list(x=initpara[[i]]),control=seg.control(display=FALSE,h=0.5)) summary(h.seg1) slope(h.seg1) plot(h.seg1, add=TRUE, col=i) points(d$x, d$y, col=i, pch=i, cex=0.5) lines(h.seg1, term="x", col=i, pch=i, k=i*3+10) } ################################### # FFD, REFIX, & REG ################################### # bin width binwidth=1 dundee$fixdur<-round(dundee$FDUR/binwidth)*binwidth status<-rep(1,length(dundee$fixdur)) par(mfrow=c(1,1)) # compare fix type, combine indicators fixtype<-dundee$isFFD*1+dundee$isRefix*2+dundee$isReg*3 # because there are some "4"s and 1 "0" ind<-(fixtype>0 & fixtype<4) ##################### Desc STATS ########## # number of cases table(round(fixtype[ind])) # summary by(dundee$fixdur[ind], fixtype[ind], summary) out1<-kphaz.fit(dundee$fixdur[ind],status[ind],fixtype[ind]) # kphaz.plot(out1, maxx=600, maxy=0.03, smooth=TRUE) ############################### outliers<-c(61.5, 103.5, 145.5, 187.5, 226.5, 241.5, 268.5, 310.5, 352.5, 394.5, 415.5, 436.5, 478.5, 502.5, 544.5) # new outliers in this analysis outliers<-c(outliers, c(228.5, 242.5, 270.5, 312.5)) for (t in outliers) { out1$haz[out1$time==t]<- NA } ##### ##### us=unique(out1$strata) initpara<-vector("list", length(us)); for (j in 1:length(us)) { initpara[[j]]<-c(100, 180, 250); } #initpara[[1]]<-c(120, 180, 250) #initpara[[2]]<-c(120, 180) #initpara[[3]]<-c(120, 180, 250) par(mfrow=c(1, 1)) maxtime=450 mintime=60 plot(out1$time, out1$haz, , xlim=c(50,maxtime), ylim=c(-.005,0.02),xlab="Fixation Duration", ylab="Hazard rate",type="n") leg.txt<- c("Forward", "Refixation", "Regression") legend(mintime, 0.015, leg.txt, pch=1:length(us), col=1:length(us)) # add horizontal line at 0,0 lines(c(0,maxtime), c(0,0)) for (i in 1:length(us)) { ind<-(out1$strata==us[i] & out1$time<=maxtime & out1$time>=60 & out1$haz<0.02 & !is.na(out1$haz)) # tt<-smooth(out1$haz[ind]) d<-data.frame(y=out1$haz[ind], x=out1$time[ind]) h.lm<-lm(y~x,data=d) summary(h.lm) #points(d$x, d$y, col=i, pch=i, cex=0.5) h.seg1<-segmented(h.lm,seg.Z=~x,psi=list(x=initpara[[i]]),control=seg.control(display=FALSE,h=0.5)) print(summary(h.seg1)) print(slope(h.seg1)) plot(h.seg1, add=TRUE, col=i, lwd=2) lines(h.seg1, term="x", col=i, pch=i, k=i*3+10) lines(d$x, smooth(smooth(d$y)), col=i) } ######################################## ################################### # Individual Differences ################################### # bin width binwidth=3 dundee$fixdur<-round(dundee$FDUR/binwidth)*binwidth status<-rep(1,length(dundee$fixdur)) par(mfrow=c(1,1)) ind<-(dundee$SUBJ>=1 & dundee$SUBJ<=10) ##################### Desc STATS ########## # number of cases table(round(dundee$SUBJ[ind])) # summary by(dundee$fixdur[ind], dundee$SUBJ[ind], summary) out1<-kphaz.fit(dundee$fixdur[ind],status[ind],dundee$SUBJ[ind]) kphaz.plot(out1, maxx=600, maxy=0.03, smooth=TRUE) ############################### outliers<-c(61.5, 103.5, 145.5, 187.5, 226.5, 241.5, 268.5, 310.5, 352.5, 394.5, 415.5, 436.5, 478.5, 502.5, 544.5) # new outliers in this analysis outliers<-c(outliers, c(228.5, 242.5, 270.5, 312.5)) for (t in outliers) { out1$haz[out1$time==t]<- NA } ##### us=unique(out1$strata) initpara<-vector("list", length(us)); for (j in 1:length(us)) { initpara[[j]]<-c(120, 180, 250); } initpara[[10]]<-c(120, 250) initpara[[5]]<-c(120, 250) #initpara[[3]]<-c(120, 170, 230) initpara[[4]]<-c(120, 250) par(mfrow=c(1, 1)) maxtime=400 plot(out1$time, out1$haz, , xlim=c(50,maxtime), ylim=c(-0.005,0.03),xlab="Fixation Duration", ylab="Hazard rate",type="n") log.txt<-c("sa", "sb", "sc", "sd", "se", "sf", "sg", "sh", "si", "sj") legend(60, 0.025, log.txt, pch=1:length(us), col=1:length(us)) # add horizontal line at 0,0 lines(c(0,maxtime), c(0,0)) for (i in 1:length(us)) { ind<-(out1$strata==us[i] & out1$time<=maxtime & out1$time>=60 & !is.na(out1$haz)) d<-data.frame(y=out1$haz[ind], x=out1$time[ind]) h.lm<-lm(y~x,data=d) summary(h.lm) h.seg1<-segmented(h.lm,seg.Z=~x,psi=list(x=initpara[[i]]),control=seg.control(display=FALSE,h=0.5)) print(summary(h.seg1)) print(slope(h.seg1)) points(d$x, d$y, col=i, pch=i, cex=0.5) plot(h.seg1, add=TRUE, col=i, lwd=2) lines(h.seg1, term="x", col=i, pch=i, k=i*3+10) lines(d$x, smooth(d$y), col=i) } ######################################## ################################### # Word Len: Use FFD ONLY ################################### # bin width binwidth=1 dundee$fixdur<-round(dundee$FDUR/binwidth)*binwidth status<-rep(1,length(dundee$fixdur)) # Use only FFD status<-dundee$isFFD par(mfrow=c(1,1)) ind<-(dundee$WLEN>=3 & dundee$WLEN<9) ##################### Desc STATS ########## # number of cases table(round(dundee$WLEN[ind])) # summary by(dundee$fixdur[ind], round(dundee$WLEN[ind]), summary) out1<-kphaz.fit(dundee$fixdur[ind],status[ind],dundee$WLEN[ind]) kphaz.plot(out1, maxx=600, maxy=0.03, smooth=TRUE) kphaz.plot(out1, maxx=600, maxy=0.03) print("===========================================================================") print("We take out outliers") outliers<-c(61.5, 103.5, 145.5, 187.5, 226.5, 241.5, 268.5, 310.5, 352.5, 394.5, 415.5, 436.5, 478.5, 502.5, 544.5) for (t in outliers) { out1$haz[out1$time==t]<- NA } print("===========================================================================") kphaz.plot(out1, maxx=400, maxy=0.03) ind<-(!is.na(out1$haz)) temp$haz[ind]=smooth(smooth(smooth(temp$haz[ind]))) kphaz.plot(temp, maxx=400, maxy=0.03) ###################################### us=unique(out1$strata) initpara<-vector("list", length(us)); for (j in 1:length(us)) { initpara[[j]]<-c(120, 180, 250); } #initpara[[6]]<-c(120, 250) #initpara[[5]]<-c(120, 250) #initpara[[4]]<-c(120, 250) #initpara[[1]]<-c(120, 180, 240) par(mfrow=c(1, 1)) mintime=50 maxtime=400 plot(out1$time, out1$haz, , xlim=c(mintime,maxtime), ylim=c(-0.005,0.02),xlab="Fixation Duration", ylab="Hazard rate",type="n") leg.txt<- c("3 letters", "4 letters", "5 letters", "6 letters", "7 letters", "8 letters") #legend(10, 0.015, leg.txt, pch=1:6, col=1:6) legend(mintime, 0.015, leg.txt, pch=1:length(us), col=1:length(us)) # add horizontal line at 0,0 lines(c(0,maxtime), c(0,0)) for (i in 1:length(us)) { # # ind<-(out1$strata==us[i] & out1$time<=maxtime & out1$time>=60 & #out1$time!=187.5 & out1$time!=226.5 & out1$time!=241.5 & out1$time!=268.5 #& out1$haz<0.018) ind<-(out1$strata==us[i] & out1$time<=maxtime & out1$time>=mintime & !is.na(out1$haz) & out1$haz<0.017) d<-data.frame(y=out1$haz[ind], x=out1$time[ind]) h.lm<-lm(y~x,data=d) summary(h.lm) h.seg1<-segmented(h.lm,seg.Z=~x,psi=list(x=initpara[[i]]), control=seg.control(display=FALSE,h=0.5)) print(summary(h.seg1)) print(slope(h.seg1)) # points(d$x, d$y, col=i, pch=i, cex=0.5) plot(h.seg1, add=TRUE, col=i, lwd=2) lines(h.seg1, term="x", col=i, pch=i, k=i*3+10) lines(d$x, smooth(smooth(d$y)), col=i) } ######################################## ################################### # Word Freq: Use First Fixation Duration ################################### # bin width binwidth=1 dundee$fixdur<-round(dundee$FDUR/binwidth)*binwidth status<-rep(1,length(dundee$fixdur)) # Use only FFD status<-dundee$isFFD par(mfrow=c(1,2)) # excluding logfreq=8, most likely "the", "of", etc. # ind1<-(dundee$logfreq>=1 & dundee$logfreq<=7 & !is.na(dundee$logfreq)) ind1<-(dundee$logfreq>=1 & dundee$logfreq<=7 & !is.na(dundee$logfreq) & dundee$WLEN>3 & dundee$WLEN<8) ##################### Desc STATS ########## # number of cases table(round(dundee$logfreq[ind1])) # summary by(dundee$fixdur[ind1], round(dundee$logfreq[ind1]), summary) ##################### ##################### FIT hazard ############# out1<-kphaz.fit(dundee$fixdur[ind1],status[ind1],round(dundee$logfreq[ind1])) kphaz.plot(out1, maxx=600, maxy=0.03, smooth=TRUE, points=FALSE) print("===========================================================================") outliers<-c(61.5, 103.5, 145.5, 187.5, 226.5, 241.5, 268.5, 310.5, 352.5, 394.5, 415.5, 436.5, 478.5, 502.5, 544.5) # new outliers in this analysis outliers<-c(outliers, c(228.5, 242.5, 270.5, 312.5)) for (t in outliers) { out1$haz[out1$time==t]<- NA } print("===========================================================================") kphaz.plot(out1, maxx=400, maxy=0.03) ind<-(!is.na(out1$haz)) temp$haz[ind]=smooth(smooth(smooth(temp$haz[ind]))) kphaz.plot(temp, maxx=400, maxy=0.03) # Now do the segmented regression us=unique(out1$strata) initpara<-vector("list", length(us)); for (j in 1:length(us)) { initpara[[j]]<-c(120, 180, 250); } #initpara[[5]]<-c(120, 250) #initpara[[3]]<-c(120, 170, 230) initpara[[7]]<-c(120, 180) initpara[[6]]<-c(120, 180) par(mfrow=c(1, 1)) maxtime=400 mintime=50 plot(out1$time, out1$haz, , xlim=c(50,maxtime), ylim=c(-0.005,0.025),xlab="Fixation Duration", ylab="Hazard rate",type="n") leg.txt<- ceiling(exp(1:length(us))) leg.text<-c("3-7", "8-20", "21-50", "51-150", "151-400", "401-1000", "1000-3000") legend(mintime, 0.020, leg.text, pch=1:length(us), col=1:length(us)) # add horizontal line at 0,0 lines(c(0,maxtime), c(0,0)) for (i in length(us):1) { # there are some wierd time points that generate high hazard rates for pretty much all strata # sort(unique(out1$time[out1$haz>0.18 & out1$time<300])) # ind<-(out1$strata==us[i] & out1$time<=maxtime & out1$time>=60 & !is.na(out1$haz)) # tt<-smooth(out1$haz[ind]) d<-data.frame( x=out1$time[ind], y=out1$haz[ind]) #points(d$x, d$y, col=i, pch=i, cex=0.5) h.lm<-lm(y~x,data=d) summary(h.lm) h.seg1<-segmented(h.lm,seg.Z=~x,psi=list(x=initpara[[i]]),control=seg.control(display=FALSE,h=0.5)) print(summary(h.seg1)) print(slope(h.seg1)) #points(d$x, d$y, col=i, pch=i, cex=0.5) plot(h.seg1, add=TRUE, col=i, lwd=2) lines(h.seg1, term="x", col=i, pch=i, k=i*3+10) lines(d$x, smooth(smooth(d$y)), col=i) } ######################################## ################################### # Overall distribution ################################### # bin width binwidth=1 dundee$fixdur<-round(dundee$FDUR/binwidth)*binwidth status<-rep(1,length(dundee$fixdur)) out1<-kphaz.fit(dundee$fixdur,status) out1$hazraw<-out1$haz # The Dr. Bouis seems to have a peoridic of 41~42ms where a peak in hazard will occur outliers<- sort( c( sort(unique(out1$time[out1$haz>0.020 & out1$time<600])), sort(unique(out1$time[out1$haz>0.009 & out1$time<160])), sort(unique(out1$time[out1$haz>0.005 & out1$time<110])), sort(unique(out1$time[out1$haz>0.003 & out1$time<80])) )) print("The following hazard rates are likely to be outliers due to the eye tracker") print(outliers) print("They come in at a cycle of 41~42msec, or ~24Hz") print(diff(outliers)) print(1000/diff(outliers)) print("We censor them out by setting the hazard rate to NA") for (t in outliers) { out1$haz[out1$time==t]<- NA } # Now do the segmented regression us=unique(out1$strata) initpara<-vector("list", length(us)); initpara[[1]]<-c(120, 180, 250, 350); initpara[[2]]<-c(120, 180, 250); initpara[[3]]<-c(120, 250); initpara[[4]]<-c(120, 180); par(mfrow=c(1, 1)) maxtime=650 mintime=60 plot(out1$time, out1$haz, , xlim=c(50,maxtime), ylim=c(-.005,0.03),xlab="Fixation Duration", ylab="Hazard rate/Prob. density",type="n") #leg.txt<- "All fixations in Dundee Corpus" #legend(mintime, 0.025, leg.txt, pch=1:length(us), col=1:length(us)) ind<-(out1$time<=maxtime & out1$time>=mintime & !is.na(out1$haz)) d<-data.frame( x=out1$time[ind], y=out1$haz[ind]) h.lm<-lm(y~x,data=d) summary(h.lm) lines(out1$time, out1$hazraw, col="light gray") lines(d$x, smooth(d$y), col=1) for (i in 1:length(initpara)) { h.seg1<-segmented(h.lm,seg.Z=~x,psi=list(x=initpara[[i]]),control=seg.control(display=FALSE,h=0.5)) print(summary(h.seg1)) print(slope(h.seg1)) plot(h.seg1, add=TRUE, col=i) lines(h.seg1, term="x", col=i, pch=1, k=i*3+10) } # add pdf: use a large high limit for breakpoints, else the pdf will reflect the censoring. fixhist<-hist(dundee$fixdur[dundee$fixdur<2000], 1:2000, plot=FALSE) histtime<-c(0, fixhist$breaks[1:maxtime]) histpdf<-c(0, fixhist$density[1:maxtime]) lines(histtime,histpdf , col="pink") # add horizontal line at 0,0 lines(c(0,maxtime), c(0,0)) ###################################### # for language data, create a "out1" data frame ################################### # Landpos x WordLen ################################### # calculate landing position dundee$landpos<-dundee$WDLP/dundee$WLEN dundee$isOVP<-rep(0, length(dundee$FDUR)) dundee$isOVP[dundee$landpos>0.2 & dundee$landpos<0.7] <- 1 par(mfrow=c(2,3)) for(wl in 5:10) { ind<-dundee$WLEN==wl out2<-kphaz.fit(dundee$fixdur[ind],status[ind],dundee$isOVP[ind]) kphaz.plot(out2, maxx=600, maxy=0.03, smooth=TRUE) } # smooth the hazar lines # rough, as it will mess up when switching between strata hazz<-smooth(out$haz) plot(out$time, hazz, type="n") lines(out$time, hazz)